#   LibAut
#       outcome regressions: data preparation, simple
#   Juraj Medzihorsky
#   2021-08-10

library(parallel)
options(mc.cores=3)
options(stringsAsFactors=F)
source('helpers.R')

#   sessionInfo()
##  R version 3.6.3 (2020-02-29)
##  Platform: x86_64-pc-linux-gnu (64-bit)
##  Running under: Ubuntu 16.04.7 LTS
##
##  Matrix products: default
##  BLAS/LAPACK: /opt/OpenBLAS/lib/libopenblas_haswellp-r0.3.9.so
##
##  locale:
##   [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C               LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8
##   [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8    LC_PAPER=en_GB.UTF-8       LC_NAME=C
##   [9] LC_ADDRESS=C               LC_TELEPHONE=C             LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
##  attached base packages:
##  [1] parallel  stats     graphics  grDevices utils     datasets  methods   base
##
##  other attached packages:
##  [1] nvimcom_0.9-28 colorout_1.2-0
##
##  loaded via a namespace (and not attached):
##  [1] compiler_3.6.3 tools_3.6.3

#------------------------------------------------------------------------------

code_dir <- getwd()
data_dir <- gsub('code$', 'data', code_dir)

setwd(data_dir)
ert <- read.csv('eps.csv')
vdem <- readRDS('V-Dem-CY-Full+Others-v10.rds')
setwd(code_dir)

#------------------------------------------------------------------------------

v <- subset(vdem, year>1899)

#   dem_ep_prch
#       0   not in an ep with dem transition potential
#       1   in an ep with potential for a dem transition
#   dem_ep_ptr: 
#       0   not in an a dem transition episode
#       1   in an dem transition episode

#   with(ert, table('potential'=dem_ep_prch, 'transition'=dem_ep_ptr, useNA='a'))

e <- subset(ert, (dem_ep_prch%in%1) & (dem_ep_ptr%in%0))
d <- merge(v, e)

#   create excl region edi
d$excl_region_edi <- as.numeric(rep(NA,nrow(d)))
for (i in 1:nrow(d)) {
    econd1 <- d$e_regionpol_6C%in%d$e_regionpol_6C[i]
    econd2 <- !(d$country_text_id%in%d$country_text_id[i])
    d$excl_region_edi[i] <- mean(d$v2x_polyarchy[econd1&econd2], na.rm=T)
}
gc()

#   create excl region edi
v$excl_region_edi <- as.numeric(rep(NA,nrow(v)))
for (i in 1:nrow(v)) {
    econd1 <- v$e_regionpol_6C%in%v$e_regionpol_6C[i]
    econd2 <- !(v$country_text_id%in%v$country_text_id[i])
    v$excl_region_edi[i] <- mean(v$v2x_polyarchy[econd1&econd2], na.rm=T)
}
gc()


#------------------------------------------------------------------------------
#    ONLY EPISODES

E <- d[!is.na(d$dem_ep_id), ]
EL <- split(E, f=(E$dem_ep_id)) 

getstartcondition <-
    function(x)
    {
        dem_ep_id <- x$dem_ep_id[1]
        final_outcome <- x$dem_ep_outcome[nrow(x)]
        data.frame(dem_ep_id, final_outcome)
    }

epframe <- do.call(rbind, lapply(EL, getstartcondition))

#   table(E$dem_ep_outcome, useNA='a')
#   table(epframe$final_outcome, useNA='a')

#   recode outcomes
#   epy: suc 1 fail 0
epframe$epy <- as.numeric(rep(NA,nrow(epframe)))
#   success: 1, 5
epframe$epy[epframe$final_outcome%in%c(1,5)] <- 1
#   fail: 2,3,4
epframe$epy[epframe$final_outcome%in%c(2,3,4)] <- 0
#   censored: 6
epframe$epy[epframe$final_outcome%in%6] <- NA

#   with(epframe, table(final_outcome, epy, useNA='a'))

#------------------------------------------------------------------------------
#   find pre-ep values in vdem

vars_to_pre <-
    c('excl_region_edi',
      'v2x_polyarchy',
      'v2x_regime',
      dname('gdp', d),
      'e_mipopula',
      'v2csprtcpt',
      'v2pepwrsoc',
      'e_miinteco',
      'e_miinterc',
      'v2xnp_regcorr',
      'v2xnp_pres',
      'v2xnp_client',
      'v2x_neopat',
      'v2xps_party',
      'v2xeg_eqdr',
      'year',
      'e_regionpol_6C')

#   vars_to_pre %in% names(v)

findpre <-
    function(epid, vars=vars_to_pre, d=v)
    {
        e <- as.character(epid)
        ctr <- substr(e, 1, 3)
        beg <- as.numeric(substr(e, 5, 8))
        pre <- beg-1
        out <- d[(d$year%in%pre)&(d$country_text_id%in%ctr), vars]
        return(out)
    }

epvars <- do.call(rbind, mclapply(epframe$dem_ep_id, findpre))
epvars$logpop <- log(epvars$e_mipopula)
gc()


names(epvars) <- ifelse((1:21)%in%c(19,20), names(epvars), paste0('lag1_', names(epvars)))

eps <- cbind(epframe, epvars)


#   dump what's missing on these
regv <- c('epy',
          'year',
          'lag1_v2x_polyarchy',
          'lag1_excl_region_edi',
          'lag1_e_miinterc',
          'lag1_e_miinteco',
          'lag1_e_migdppcln',
          'lag1_e_migdpgro',
          'lag1_e_mipopula',
          'lag1_v2csprtcpt',
          'lag1_v2pepwrsoc',
          'lag1_v2xeg_eqdr',
          'lag1_v2xnp_pres')

#   na_fil <- rowSums(is.na(eps[, regv]))<1
dat <- eps

#   table(dat$epy, useNA='a')
##  371     226 F 145 S

dx <- lapply(dat, function(x) as.numeric(as.character(x)))
dx$dem_ep_id <- dat$dem_ep_id
dy <- as.data.frame(dx)
dat <- dy

#-------------------------------------------------------------------------------
#   save

setwd(data_dir)
saveRDS(dat, 'outcome_simple_20210810.rds') 

#   SCRIPT END